In [1]:
#import libraries
import numpy as np
import pandas as pd
import os
In [2]:
#load the raw weather table; cells holding "M", "-" or "*" are parsed as missing values
weather = pd.read_csv(
    os.path.join("data", "weather.csv"),
    na_values=["M", "-", "*"],
)
In [3]:
#print the name of every column that holds at least one missing value
for column in weather.columns:
    if weather[column].isnull().any():
        #function-call form of print works on both Python 2 and Python 3
        print(column)
In [4]:
def change_snowfall(x):
    """Convert one raw snowfall entry to a float.

    The trace marker "T" becomes 0.01, meaning something is falling down
    but not that much. Whitespace around the marker is ignored, so " T",
    "T" and "  T " are all accepted. Every other value is handed to
    float(), which raises ValueError for text that is not a number.
    """
    #only text values have strip(); numbers (including NaN) are left untouched
    marker = x.strip() if hasattr(x, "strip") else x
    if marker == "T":
        return 0.01
    return float(x)
#replace the snowfall column with its numeric version, one entry at a time
weather["snowfall"] = weather["snowfall"].apply(change_snowfall)
In [5]:
def change_preciptotal(x):
    """Convert one raw preciptotal entry to a float.

    The trace marker "T" becomes 0.01. Whitespace around the marker is
    ignored, so " T", "T" and "  T " are all accepted. Every other value
    is handed to float(), which raises ValueError for text that is not a
    number.
    """
    #only text values have strip(); numbers (including NaN) are left untouched
    marker = x.strip() if hasattr(x, "strip") else x
    if marker == "T":
        return 0.01
    return float(x)
#replace the preciptotal column with its numeric version, one entry at a time
weather["preciptotal"] = weather["preciptotal"].apply(change_preciptotal)
In [6]:
#separate codesum column into separate binary columns
#collect every distinct whitespace-separated code; sorted so that the column
#order is the same on every run (iteration order of a set is arbitrary)
codesum_columns = sorted(set(' '.join(set(weather["codesum"])).split()))
#one empty (all-NaN) column per code, aligned with the rows of weather
codesum = pd.DataFrame(index=weather.index, columns=codesum_columns)
In [7]:
#set a 1 in each code column for the rows whose codesum text contains that code
for column in codesum.columns:
    #plain substring test (regex=False), not a whole-token match: a short code
    #also matches inside a longer code that contains it
    has_code = weather["codesum"].str.contains(column, regex=False, na=False)
    #one .loc assignment per column; chained codesum[column][i] = 1 would write
    #to a temporary object when pandas copy-on-write is active
    codesum.loc[has_code, column] = 1
In [8]:
#drop initial codesum column
#axis is passed by keyword: pandas 2.0 no longer accepts it positionally
weather = weather.drop("codesum", axis=1)
In [9]:
#attach the code columns; unset cells become 0 and the flags are stored as
#integers explicitly instead of relying on fillna to downcast object columns
weather = weather.join(codesum.fillna(0).astype(int))
In [10]:
#preview the first rows of the updated frame
weather.head()
Out[10]:
In [11]:
def diff_dates_2015(date_x):
    """Return the number of days from date_x up to 01/01/2015.

    date_x is a string in month/day/year form ("%m/%d/%Y"). The result is
    positive for dates before 01/01/2015, zero for that day and negative
    for later dates.

    Note: the name datetime used here is imported in a later cell of this
    notebook, so that cell has to run before this function is called.
    """
    layout = "%m/%d/%Y"
    reference = datetime.strptime('01/01/2015', layout)
    given = datetime.strptime(date_x, layout)
    return (reference - given).days
Add a column with the number of days elapsed since the beginning of the corresponding year.
In [12]:
from datetime import datetime
def get_days(date_x):
    """Return how many days lie between January 1st of date_x's year and date_x.

    date_x is a string in year-month-day form ("%Y-%m-%d"); January 1st
    itself gives 0.
    """
    layout = "%Y-%m-%d"
    current = datetime.strptime(date_x, layout)
    #January 1st of the same year, built through the same text format
    year_start = datetime.strptime('{year}-01-01'.format(year=current.year), layout)
    return (current - year_start).days
In [13]:
#new column: day offset of each date within its own year
weather['days'] = weather['date'].apply(get_days)
In [14]:
def in_minutes(x):
    """Convert a clock time written as the number HHMM into minutes after midnight.

    x is a numeric time such as 545 (05:45) or 1759 (17:59), or NaN.
    Returns the minutes since 00:00 as a float, or NaN when x is NaN.
    A value whose last two digits are 60 is treated as one minute earlier
    (1760 is read as 17:59).
    Raises ValueError when the hour part is outside 0-23 or the minute
    part is outside 0-60.
    """
    if np.isnan(x):
        return np.nan
    #split arithmetically instead of parsing str(x) with "%H%M": the text
    #parse reads 105 as 10:05 and cannot handle values below 100
    hours, minutes = divmod(int(x), 100)
    if minutes == 60:
        minutes = 59
    if not (0 <= hours <= 23 and 0 <= minutes <= 59):
        raise ValueError("not a valid HHMM time: {0!r}".format(x))
    return float(hours * 60 + minutes)
In [15]:
#sunrise as minutes after midnight
weather["sunrise"] = weather['sunrise'].map(in_minutes)
In [16]:
#sunset as minutes after midnight
weather["sunset"] = weather['sunset'].map(in_minutes)
In [17]:
#summary of the final frame (columns, non-null counts, dtypes) before saving
weather.info()
In [18]:
#write the processed weather table to disk, leaving out the row index
weather.to_csv(
    os.path.join("data", "weather_modified_4.csv"),
    index=False,
)
In [ ]: